# Always print this out before your assignment
sessionInfo()
R version 4.1.1 (2021-08-10)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows 10 x64 (build 18362)
Matrix products: default
locale:
[1] LC_COLLATE=English_United States.1252 LC_CTYPE=English_United States.1252
[3] LC_MONETARY=English_United States.1252 LC_NUMERIC=C
[5] LC_TIME=English_United States.1252
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] corrplot_0.92 ggridges_0.5.3 glmnetUtils_1.1.8 glmnet_4.1-2
[5] Matrix_1.3-4 scales_1.1.1 tidyquant_1.0.3 quantmod_0.4.18
[9] TTR_0.24.2 PerformanceAnalytics_2.0.4 xts_0.12.1 zoo_1.8-9
[13] plotly_4.10.0 viridis_0.6.2 viridisLite_0.4.0 pastecs_1.3.21
[17] kableExtra_1.3.4 lubridate_1.8.0 rsample_0.1.0 ggthemes_4.2.4
[21] ggrepel_0.9.1 here_1.0.1 fs_1.5.0 forcats_0.5.1
[25] stringr_1.4.0 dplyr_1.0.7 purrr_0.3.4 readr_2.0.2
[29] tidyr_1.1.4 tibble_3.1.5 ggplot2_3.3.5 tidyverse_1.3.1
[33] knitr_1.36
loaded via a namespace (and not attached):
[1] colorspace_2.0-2 ellipsis_0.3.2 rprojroot_2.0.2 rstudioapi_0.13 listenv_0.8.0 furrr_0.2.3
[7] farver_2.1.0 bit64_4.0.5 fansi_0.5.0 xml2_1.3.2 splines_4.1.1 codetools_0.2-18
[13] jsonlite_1.7.2 broom_0.7.9 dbplyr_2.1.1 compiler_4.1.1 httr_1.4.2 backports_1.3.0
[19] assertthat_0.2.1 fastmap_1.1.0 lazyeval_0.2.2 cli_3.0.1 htmltools_0.5.2 tools_4.1.1
[25] gtable_0.3.0 glue_1.4.2 Rcpp_1.0.7 cellranger_1.1.0 jquerylib_0.1.4 vctrs_0.3.8
[31] svglite_2.0.0 iterators_1.0.13 crosstalk_1.2.0 xfun_0.27 globals_0.14.0 rvest_1.0.2
[37] lifecycle_1.0.1 pacman_0.5.1 future_1.22.1 vroom_1.5.5 hms_1.1.1 parallel_4.1.1
[43] yaml_2.2.1 curl_4.3.2 gridExtra_2.3 sass_0.4.0 stringi_1.7.5 highr_0.9
[49] foreach_1.5.1 boot_1.3-28 shape_1.4.6 rlang_0.4.11 pkgconfig_2.0.3 systemfonts_1.0.3
[55] evaluate_0.14 lattice_0.20-44 htmlwidgets_1.5.4 labeling_0.4.2 bit_4.0.4 tidyselect_1.1.1
[61] parallelly_1.28.1 plyr_1.8.6 magrittr_2.0.1 R6_2.5.1 generics_0.1.0 DBI_1.1.1
[67] pillar_1.6.3 haven_2.4.3 withr_2.4.2 survival_3.2-11 modelr_0.1.8 crayon_1.4.1
[73] Quandl_2.11.0 utf8_1.2.2 tzdb_0.1.2 rmarkdown_2.11 grid_4.1.1 readxl_1.3.1
[79] data.table_1.14.2 reprex_2.0.1 digest_0.6.28 webshot_0.5.2 munsell_0.5.0 bslib_0.3.1
[85] quadprog_1.5-8
getwd()
[1] "C:/Users/cabrooke/Documents/GitHub/BROCODE_Final_Project"
# load all your libraries in this chunk
# Attach all packages used in this analysis.
# FIXES: 'tidyverse' was loaded twice (it attaches dplyr/ggplot2/forcats
# anyway) and 'ggthemes' was loaded twice; duplicates removed and quote
# style standardized to double quotes.
library("tidyverse")
library("fs")
library("here")
library("dplyr")
library("ggplot2")
library("ggrepel")
library("ggthemes")
library("forcats")
library("rsample")
library("lubridate")
library("kableExtra")
library("pastecs")
library("viridis")
library("plotly")
library("tidyquant")
library("scales")
library("gdata")
gdata: Unable to locate valid perl interpreter
gdata:
gdata: read.xls() will be unable to read Excel XLS and XLSX files unless the 'perl=' argument is used
gdata: to specify the location of a valid perl intrpreter.
gdata:
gdata: (To avoid display of this message in the future, please ensure perl is installed and available
gdata: on the executable search path.)
gdata: Unable to load perl libaries needed by read.xls()
gdata: to support 'XLX' (Excel 97-2004) files.
gdata: Unable to load perl libaries needed by read.xls()
gdata: to support 'XLSX' (Excel 2007+) files.
gdata: Run the function 'installXLSXsupport()'
gdata: to automatically download and install the perl
gdata: libaries needed to support Excel XLS and XLSX formats.
Attaching package: ‘gdata’
The following objects are masked from ‘package:xts’:
first, last
The following objects are masked from ‘package:pastecs’:
first, last
The following objects are masked from ‘package:dplyr’:
combine, first, last
The following object is masked from ‘package:purrr’:
keep
The following object is masked from ‘package:stats’:
nobs
The following object is masked from ‘package:utils’:
object.size
The following object is masked from ‘package:base’:
startsWith
# note, do not run install.packages() inside a code chunk. install them in the console outside of a code chunk.
1a) Loading data
#Reading the data in and doing minor initial cleaning in the function call
#Reproducible data analysis should avoid all automatic string to factor conversions.
#strip.white removes white space
#na.strings is a substitution so all that have "" will = na
# Read the raw donor extract. Blank cells become NA (na.strings = ""),
# leading/trailing whitespace is trimmed at read time (strip.white),
# and character columns stay character (stringsAsFactors = FALSE).
data <- read.csv(here::here("final_project", "donor_data.csv"),
stringsAsFactors = FALSE,
strip.white = TRUE,
na.strings = "")
1b) Fixing the wonky DOB & Data cleanup
#(Birthdate and Age, ID as a number)adding DOB (Age/Spouse Age) in years columns and adding two fields for assignment and number of children and number of degrees
# Initial cleanup: real Date columns, ages in completed years, numeric ID,
# and derived flags/counts used downstream.
# "0001-01-01" is the export's placeholder for "no date" -> recode to NA
# before parsing with lubridate::mdy().
# FIX: mutate(ID = as.numeric(ID)) appeared twice; duplicate removed.
dataclean <- data %>%
  mutate(Birthdate = ifelse(Birthdate == "0001-01-01", NA, Birthdate),
         Birthdate = mdy(Birthdate),
         # age in whole years as of today
         Age = as.numeric(floor(interval(start = Birthdate, end = Sys.Date()) /
                                  duration(n = 1, unit = "years"))),
         Spouse.Birthdate = ifelse(Spouse.Birthdate == "0001-01-01", NA,
                                   Spouse.Birthdate),
         Spouse.Birthdate = mdy(Spouse.Birthdate),
         Spouse.Age = as.numeric(floor(interval(start = Spouse.Birthdate,
                                                end = Sys.Date()) /
                                         duration(n = 1, unit = "years"))),
         ID = as.numeric(ID),
         # 1 if the donor has an assignment number on file, else 0
         Assignment_flag = ifelse(is.na(Assignment.Number), 0, 1),
         # counts assume child/degree slots are filled left-to-right (max 2)
         No_of_Children = ifelse(is.na(Child.1.ID), 0,
                                 ifelse(is.na(Child.2.ID), 1, 2)),
         nmb_degree = ifelse(is.na(Degree.Type.1), 0,
                             ifelse(is.na(Degree.Type.2), 1, 2)))
#conferral dates
# Helpers for the repeated date-cleanup pattern:
# clean_mdy(): recode the "0001-01-01" placeholder to NA, then parse m/d/y.
clean_mdy <- function(x) {
  mdy(ifelse(x == "0001-01-01", NA, x))
}
# years_since(): completed years between a date and today (NA-propagating).
years_since <- function(d) {
  as.numeric(floor(interval(start = d, end = Sys.Date()) /
                     duration(n = 1, unit = "years")))
}
# Conferral / last-contact / first-gift dates plus their "years ago" columns.
dataclean <- dataclean %>%
  mutate(Conferral.Date.1 = clean_mdy(Conferral.Date.1),
         Conferral.Date.1.Age = years_since(Conferral.Date.1),
         Conferral.Date.2 = clean_mdy(Conferral.Date.2),
         Conferral.Date.2.Age = years_since(Conferral.Date.2),
         Last.Contact.By.Anyone = clean_mdy(Last.Contact.By.Anyone),
         Last.Contact.Age = years_since(Last.Contact.By.Anyone),
         HH.First.Gift.Date = clean_mdy(HH.First.Gift.Date),
         HH.First.Gift.Age = years_since(HH.First.Gift.Date))
#major gift
# Flag donors with more than $50K lifetime giving as major gifters.
# Stored as a factor with explicit "0"/"1" levels; NA giving stays NA.
dataclean <- dataclean %>%
  mutate(major_gifter = factor(ifelse(Lifetime.Giving > 50000, 1, 0),
                               levels = c("0", "1")))
#splitting up the age into ranges and creating category for easy visualization
# Decade buckets for Age for easy visualization.
# case_when() replaces a 9-deep nested ifelse(); ages outside 10-120
# (and NA Age) fall through to NA, exactly as before.
dataclean <- dataclean %>%
  mutate(age_range = case_when(
    Age %in% 10:19  ~ "10 < 20 years old",
    Age %in% 20:29  ~ "20 < 30 years old",
    Age %in% 30:39  ~ "30 < 40 years old",
    Age %in% 40:49  ~ "40 < 50 years old",
    Age %in% 50:59  ~ "50 < 60 years old",
    Age %in% 60:69  ~ "60 < 70 years old",
    Age %in% 70:79  ~ "70 < 80 years old",
    Age %in% 80:89  ~ "80 < 90 years old",
    Age %in% 90:120 ~ "90+ years old",
    TRUE            ~ NA_character_
  ))
#seeing what we have
table(dataclean$age_range)
10 < 20 years old 20 < 30 years old 30 < 40 years old 40 < 50 years old 50 < 60 years old 60 < 70 years old
3985 24558 21037 16851 20755 18257
70 < 80 years old 80 < 90 years old 90+ years old
12246 5984 6633
#20 < 30 is the most common age range (24,558 donors); 50-60 is third
#creating a region column using the county data and the OMB MSA (Metropolitan Statistical Area) definitions
# Map county + state to an OMB-MSA-style region.
# Lookup vectors + case_when() replace a 42-deep nested ifelse();
# anything unmatched (including NA county/state) becomes NA, as before.
socal_ca <- c("San Luis Obispo", "Kern", "San Bernardino", "Santa Barbara",
              "Ventura", "Los Angeles", "Orange", "Riverside", "San Diego",
              "Imperial")
seattle_wa <- c("King", "Snohomish", "Pierce")
portland_or <- c("Clackamas", "Columbia", "Multnomah", "Washington", "Yamhill")
portland_wa <- c("Clark", "Skamania")  # WA counties in the Portland MSA
denver_co <- c("Denver", "Arapahoe", "Jefferson", "Adams", "Douglas",
               "Broomfield", "Elbert", "Park", "Clear Creek")
bayarea_ca <- c("Alameda", "Contra Costa", "Marin", "Monterey", "Napa",
                "San Benito", "San Francisco", "San Mateo", "Santa Clara",
                "Santa Cruz", "Solano", "Sonoma")
dataclean <- dataclean %>%
  mutate(region = case_when(
    State == "CA" & County %in% socal_ca    ~ "So Cal",
    State == "WA" & County %in% seattle_wa  ~ "Seattle",
    State == "OR" & County %in% portland_or ~ "Portland",
    State == "WA" & County %in% portland_wa ~ "Portland",
    State == "CO" & County %in% denver_co   ~ "Denver",
    State == "CA" & County %in% bayarea_ca  ~ "Bay Area",
    TRUE ~ NA_character_
  ))
# New York metro overlay on top of the region column built above.
# BUG FIX: Bergen, Hudson, and Passaic are New Jersey counties and Pike is
# a Pennsylvania county (per the OMB NY-Newark-Jersey City MSA), but the
# original tested them against the wrong state, so those branches could
# never match; corrected here.
ny_counties <- c("Kings", "Queens", "New York", "Bronx", "Richmond",
                 "Westchester", "Putnam", "Rockland", "Suffolk", "Nassau")
nj_counties <- c("Bergen", "Hudson", "Passaic", "Middlesex", "Monmouth",
                 "Ocean", "Somerset", "Essex", "Union", "Morris", "Sussex",
                 "Hunterdon")
dataclean <- dataclean %>%
  mutate(region = case_when(
    State == "NY" & County %in% ny_counties ~ "New York",
    State == "NJ" & County %in% nj_counties ~ "New York",
    State == "PA" & County == "Pike"        ~ "New York",
    TRUE ~ region
  ))
# code nor cal region as all others in CA not already defined
# Everything in CA not already assigned a region above becomes "Nor Cal".
# (Redundant `is.na(region) == TRUE` simplified to `is.na(region)`.)
dataclean <- dataclean %>%
  mutate(region = ifelse(State == "CA" & is.na(region), "Nor Cal", region))
#Removing Columns that provide no benefit
# Drop assignment/wealth-screening columns that provide no modeling benefit.
# FIX: Expected.Book.Date was listed twice; the duplicate is removed
# (negative indexing tolerated it, but it was clearly a paste error).
dataclean <- subset(dataclean, select = -c(Assignment.Number,
                                           Assignment.has.Historical.Mngr,
                                           Suffix,
                                           Assignment.Date,
                                           Assignment.Manager,
                                           Assignment.Role,
                                           Assignment.Title,
                                           Assignment.Status,
                                           Strategy,
                                           Progress.Level,
                                           Assignment.Group,
                                           Assignment.Category,
                                           Funding.Method,
                                           Expected.Book.Date,
                                           Qualification.Amount,
                                           Expected.Book.Amount,
                                           Hard.Gift.Total,
                                           Soft.Credit.Total,
                                           Total.Assignment.Gifts,
                                           No.of.Pledges,
                                           Proposal..,
                                           Proposal.Notes,
                                           HH.Life.Spouse.Credit,
                                           Last.Contact.By.Manager,
                                           X..of.Contacts.By.Manager,
                                           DonorSearch.Range,
                                           iWave.Range,
                                           WealthEngine.Range,
                                           Philanthropic.Commitments))
#cleaning up zip codes removing -4 after
dataclean$Zip <- gsub(dataclean$Zip, pattern="-.*", replacement = "")
#adding zip code data and column
# ZIP-level salary lookup table; column S1902_C03_002E looks like an ACS
# S1902 median-income field -- TODO confirm the source.
zip <- read.csv(here::here("final_project", "Salary_Zipcode.csv"),
stringsAsFactors = FALSE,
strip.white = TRUE,
na.strings = "")
#adding zip salary column
# NOTE(review): VLOOKUP() is not base R; it is assumed to do a
# spreadsheet-style lookup (match Zip against zip$NAME, return
# S1902_C03_002E). Confirm which attached package provides it.
dataclean <-dataclean %>%
mutate(zipcode_slry = VLOOKUP(Zip, zip, NAME, S1902_C03_002E))
#slry range
# Salary bands for the ZIP-level median income.
# BUG FIX: the first bucket (10,000-89,999) was mislabeled "90K-99K"
# (copy-paste error); it now has its own "<90K" label.
# Numeric range comparisons replace `%in% a:b`, which silently misses
# any non-integer salary value.
dataclean <- dataclean %>%
  mutate(zipslry_range = case_when(
    zipcode_slry >= 10000  & zipcode_slry < 90000   ~ "<90K",
    zipcode_slry >= 90000  & zipcode_slry < 100000  ~ "90K-99K",
    zipcode_slry >= 100000 & zipcode_slry < 150000  ~ "100K-149K",
    zipcode_slry >= 150000 & zipcode_slry < 200000  ~ "150K-199K",
    zipcode_slry >= 200000 & zipcode_slry < 250000  ~ "200K-249K",
    zipcode_slry >= 250000 & zipcode_slry < 300000  ~ "250K-299K",
    zipcode_slry >= 300000 & zipcode_slry < 350000  ~ "300K-349K",
    zipcode_slry >= 350000 & zipcode_slry < 400000  ~ "350K-399K",
    zipcode_slry >= 400000 & zipcode_slry < 500000  ~ "400K-499K",
    zipcode_slry >= 500000 & zipcode_slry < 1000000 ~ "500K-999K",
    TRUE ~ NA_character_
  ))
sum(is.na(dataclean$zipcode_slry))
[1] 62347
#adding scholarship data (y/n)
# Scholarship flags keyed by donor ID (SCHOLARSHIP column holds Y/N).
schlr <- read.csv(here::here("final_project", "scholarship.csv"),
stringsAsFactors = FALSE,
strip.white = TRUE,
na.strings = "")
#adding scholarship column
# NOTE(review): same VLOOKUP() assumption as the zip-salary join above --
# match ID in schlr$ID and return schlr$SCHOLARSHIP; confirm the
# providing package.
dataclean <-dataclean %>%
mutate(scholarship = VLOOKUP(ID, schlr, ID, SCHOLARSHIP))
# Recode scholarship to numeric 0/1: "Y" -> 1, anything else (incl. NA) -> 0.
# The original replace_na('0') / ifelse() two-step is collapsed into one
# vectorized expression with the identical end state.
dataclean$scholarship <- ifelse(!is.na(dataclean$scholarship) &
                                  dataclean$scholarship == "Y", 1, 0)
#checking how many are N
# Frequency of the 0/1 scholarship flag (0 = no scholarship record found).
table(dataclean$scholarship)
0 1
295264 27962
#checking and deleting scholarship column
# NOTE(review): schlr_fct is never created anywhere in this script, so this
# returns NULL -- presumably a leftover check from an earlier draft.
class(dataclean$schlr_fct)
[1] "NULL"
# NOTE(review): the scholarship flag built above is dropped here and never
# recreated -- confirm this deletion is intentional before modeling.
dataclean = subset(dataclean, select = -c(scholarship))
#checking for duplicates N >1 indicates a records values are in the file twice
dataclean %>% group_by(ID) %>% count() %>% arrange(desc(n))
#removing duplicated records
# unique() keeps the first occurrence of each fully identical row.
dataclean <- unique(dataclean)
#n = 1 no ID with multiple records cleaned of dupes
dataclean %>% group_by(ID) %>% count() %>% arrange(desc(n))
NA
1d Creating many many factor variables
# Factor versions of the categorical columns:
#   *_fct    = NA made an explicit "(Missing)" level (fct_explicit_na)
#   *_simple = everything beyond the n most frequent levels lumped into
#              "Other" (fct_lump_n), for plotting/modeling.
# BUG FIX: the Child-2 enroll-status factor was assigned to ch1_enroll_fct,
# silently overwriting the Child-1 column built a few lines earlier;
# renamed to ch2_enroll_fct. Trailing comma before ")" also removed.
dataclean <- dataclean %>%
  mutate(
    sex_fct = fct_explicit_na(Sex),
    sex_simple = fct_lump_n(Sex, n = 4),
    married_fct = fct_explicit_na(Married),
    donorseg_fct = fct_explicit_na(Donor.Segment),
    donorseg_simple = fct_lump_n(Donor.Segment, n = 4),
    contact_fct = fct_explicit_na(Contact.Rules),
    contact_simple = fct_lump_n(Contact.Rules, n = 4),
    spomail_fct = fct_explicit_na(Spouse.Mail.Rules),
    spomail_simple = fct_lump_n(Spouse.Mail.Rules, n = 4),
    jobtitle_fct = fct_explicit_na(Job.Title),
    jobtitle_simple = fct_lump_n(Job.Title, n = 5),
    deg1_fct = fct_explicit_na(Degree.Type.1),
    deg1_simple = fct_lump_n(Degree.Type.1, n = 5),
    deg2_fct = fct_explicit_na(Degree.Type.2),
    deg2_simple = fct_lump_n(Degree.Type.2, n = 5),
    maj1_fct = fct_explicit_na(Major.1),
    maj1_simple = fct_lump_n(Major.1, n = 5),
    maj2_fct = fct_explicit_na(Major.2),
    maj2_simple = fct_lump_n(Major.2, n = 5),
    min1_fct = fct_explicit_na(Minor.1),
    min1_simple = fct_lump_n(Minor.1, n = 5),
    min2_fct = fct_explicit_na(Minor.2),
    min2_simple = fct_lump_n(Minor.2, n = 5),
    school1_fct = fct_explicit_na(School.1),
    school1_simple = fct_lump_n(School.1, n = 5),
    school2_fct = fct_explicit_na(School.2),
    school2_simple = fct_lump_n(School.2, n = 5),
    insttype_fct = fct_explicit_na(Institution.Type),
    insttype_simple = fct_lump_n(Institution.Type, n = 5),
    extra_fct = fct_explicit_na(Extracurricular),
    extra_simple = fct_lump_n(Extracurricular, n = 5),
    hhfirstgift_fct = fct_explicit_na(HH.First.Gift.Fund),
    hhfirstgift_simple = fct_lump_n(HH.First.Gift.Fund, n = 5),
    ch1_enroll_fct = fct_explicit_na(Child.1.Enroll.Status),
    ch1_enroll_simple = fct_lump_n(Child.1.Enroll.Status, n = 4),
    ch1_maj_fct = fct_explicit_na(Child.1.Major),
    ch1_maj_simple = fct_lump_n(Child.1.Major, n = 4),
    ch1_min_fct = fct_explicit_na(Child.1.Minor),
    ch1_min_simple = fct_lump_n(Child.1.Minor, n = 4),
    ch1_school_fct = fct_explicit_na(Child.1.School),
    ch1_school_simple = fct_lump_n(Child.1.School, n = 4),
    ch1_feeder_fct = fct_explicit_na(Child.1.Feeder.School),
    ch1_feeder_simple = fct_lump_n(Child.1.Feeder.School, n = 4),
    ch2_enroll_fct = fct_explicit_na(Child.2.Enroll.Status),  # was ch1_enroll_fct
    ch2_enroll_simple = fct_lump_n(Child.2.Enroll.Status, n = 4),
    ch2_maj_fct = fct_explicit_na(Child.2.Major),
    ch2_maj_simple = fct_lump_n(Child.2.Major, n = 4),
    ch2_min_fct = fct_explicit_na(Child.2.Minor),
    ch2_min_simple = fct_lump_n(Child.2.Minor, n = 4),
    ch2_school_fct = fct_explicit_na(Child.2.School),
    ch2_school_simple = fct_lump_n(Child.2.School, n = 4),
    ch2_feeder_fct = fct_explicit_na(Child.2.Feeder.School),
    ch2_feeder_simple = fct_lump_n(Child.2.Feeder.School, n = 4)
  )
#checking to see if its a factor
#class(dataclean$sex_fct)
#class(dataclean$donorseg_fct)
#class(dataclean$contact_fct)
#class(dataclean$spomail_fct)
#checking levels
#levels(dataclean$sex_simple)
#levels(dataclean$donorseg_simple)
#levels(dataclean$contact_simple)
#levels(dataclean$spomail_simple)
#levels(dataclean$hhfirstgift_simple)
#creating a table against Sex column
#table(dataclean$sex_fct, dataclean$sex_simple)
Region Analysis
| Region | Count | Mean HH Lifetime Giving |
|---|---|---|
| So Cal | 145139 | $5,090.84 |
| NA | 130306 | $2,040.98 |
| Bay Area | 20641 | $755.92 |
| Nor Cal | 10707 | $3,823.63 |
| Seattle | 5425 | $922.08 |
| New York | 4959 | $1,978.49 |
| Portland | 2976 | $1,098.24 |
| Denver | 2847 | $257.29 |
DonorSegment Analysis
| Donor Segment | Count | Mean HH Lifetime Giving |
|---|---|---|
| NA | 231974 | $0.00 |
| Lost Donor | 69718 | $4,954.47 |
| Lapsed Donor | 11193 | $10,069.79 |
| Current Donor | 5603 | $90,638.32 |
| Lapsing Donor | 3862 | $16,590.15 |
| At-Risk Donor | 650 | $77,143.93 |
First gift size
# First-gift size at selected quantiles, formatted as dollars for display.
gift_probs <- c(0.25, 0.50, 0.75, 0.90, 0.99)
aq <- as.data.frame(
  quantile(dataclean$HH.First.Gift.Amount, probs = gift_probs, na.rm = TRUE)
)
names(aq) <- "aq"
aq$aq <- dollar(aq$aq)
aq %>%
  kable(col.names = "Quantile") %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE)
| Quantile | |
|---|---|
| 25% | $0.00 |
| 50% | $0.00 |
| 75% | $0.00 |
| 90% | $40.00 |
| 99% | $1,910.06 |
NA
NA
Consecutive giving
#consecutive years of giving
# Histogram of consecutive-giving streaks, donors with at least one year.
dataclean %>%
  filter(Max.Consec.Fiscal.Years > 0) %>%
  ggplot(aes(x = Max.Consec.Fiscal.Years)) +
  geom_histogram(fill = "#002845", bins = 20) +
  theme_economist_white() +
  labs(title = "Consecutive Years of Giving Distribution",
       x = NULL, y = NULL) +
  scale_x_continuous(breaks = seq(0, 120, 2)) +
  scale_y_continuous(breaks = seq(0, 10000000, 5000))
NA
NA
NA
Lifetime giving based on number of children
# Lifetime giving ($0 < x <= $10K) colored by number of children.
dataclean %>%
  filter(HH.Lifetime.Giving > 0, HH.Lifetime.Giving <= 10000) %>%
  mutate(No_of_Children = as.factor(No_of_Children)) %>%
  ggplot(aes(x = HH.Lifetime.Giving, fill = No_of_Children)) +
  geom_histogram(bins = 30) +
  theme_economist_white() +
  labs(title = "Giving distribution and number of children",
       x = NULL, y = NULL) +
  scale_x_continuous(breaks = seq(0, 100000, 1000)) +
  scale_y_continuous(breaks = seq(0, 100000000, 5000)) +
  scale_fill_manual(values = c("#002845", "#00cfcc", "#ff9973"))
NA
NA
NA
Mean, Median, and Count of Giving in Age Ranges
# Mean/median lifetime giving and headcount per age bucket
# (the NA bucket shows up as its own row).
age_range_giving <- dataclean %>%
group_by(age_range) %>%
summarise(avg_giving = mean(HH.Lifetime.Giving, na.rm = TRUE),
med_giving = median(HH.Lifetime.Giving, na.rm = TRUE),
amount_of_people_in_age_range = n())
glimpse(age_range_giving)
Rows: 10
Columns: 4
$ age_range <chr> "10 < 20 years old", "20 < 30 years old", "30 < 40 years old", "40 < 50 ye~
$ avg_giving <dbl> 0.4455282, 28.2744487, 391.1692413, 804.5846468, 2779.1345908, 5401.706827~
$ med_giving <dbl> 0, 0, 0, 0, 0, 0, 0, 10, 15, 0
$ amount_of_people_in_age_range <int> 3985, 24551, 21024, 16831, 20737, 18226, 12195, 5954, 6626, 192871
2a) Plotting average giving by age range
# Bar chart of average giving per age bucket; x labels angled to fit.
age_range_giving <- age_range_giving %>%
  mutate(age_range = factor(age_range))
ggplot(age_range_giving, aes(x = age_range, y = avg_giving)) +
  geom_col() +  # geom_col == geom_bar(stat = "identity")
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
NA
NA
2b) Count of donors based on age range (another way to look at it)
# Headcount per age bucket (NA bucket included as its own bar).
ggplot(dataclean, aes(x = age_range)) +
  geom_bar() +
  labs(title = "Count of Age Ranges", x = "", y = "") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
NA
NA
2c) Boxplot of the Age Ranges Against the Lifetime Giving Amounts with a log scale applied - the reason we applied log scale is to resolve issues with visualizations that skew towards large values in our dataset.
# Lifetime giving by age bucket; log10 y-axis tames the heavy right skew.
ggplot(dataclean,
       aes(x = age_range, y = HH.Lifetime.Giving, fill = age_range)) +
  geom_boxplot(outlier.colour = "red") +
  scale_y_log10() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
NA
NA
2d) Splitting by age and gender
#creating boxplots
# Age by reported sex, M/F rows only, dropping implausible ages (>= 100).
dataclean %>%
  filter(Age < 100, Sex %in% c("M", "F")) %>%
  ggplot(aes(x = Sex, y = Age)) +
  geom_boxplot() +
  theme_economist() +
  labs(title = "Ages of Donors Based on Gender", x = NULL, y = NULL)
NA
NA
Giving by gender
Mean age by gender
2e) Distribution of people in the states that they live.
# Donor counts by state, restricted to states with more than 800 donors.
# The original recoded " " to the literal string "NA" and filtered on it;
# this version drops the same rows (blank, literal "NA", and missing
# states) with explicit, readable conditions.
dataclean %>%
  filter(!is.na(State), State != " ", State != "NA") %>%
  count(State, name = "Count", sort = TRUE) %>%
  filter(Count > 800) %>%
  kable(col.names = c("Donor's State", "Count")) %>%
  kable_styling(bootstrap_options = c("condensed"),
                full_width = FALSE)
| Donor's State | Count |
|---|---|
| CA | 176487 |
| WA | 7957 |
| TX | 7266 |
| NY | 5659 |
| CO | 5073 |
| AZ | 4925 |
| OR | 4612 |
| FL | 4111 |
| IL | 3681 |
| HI | 3394 |
| PA | 2904 |
| OH | 2754 |
| NV | 2715 |
| MI | 2523 |
| MA | 2473 |
| NJ | 2311 |
| VA | 2158 |
| NC | 2085 |
| GA | 2044 |
| MO | 1889 |
| MN | 1732 |
| MD | 1488 |
| TN | 1443 |
| IN | 1417 |
| CT | 1380 |
| WI | 1330 |
| UT | 1173 |
| OK | 1151 |
| AL | 1120 |
| LA | 1110 |
| ID | 1096 |
| SC | 1076 |
| KY | 1032 |
| KS | 1027 |
| NM | 981 |
| IA | 880 |
NA
NA
NA
NA
NA
NA
2f) Looking at first gift amounts for donors with any lifetime giving. 75% made a first gift of ≤ $100.
# First-gift quantiles restricted to donors with nonzero lifetime giving.
no_non_donors <- dataclean %>%
  filter(Lifetime.Giving != 0)
nd <- as.data.frame(
  quantile(no_non_donors$HH.First.Gift.Amount,
           probs = c(0.25, 0.50, 0.75, 0.90, 0.99), na.rm = TRUE)
)
names(nd) <- "nd"
nd %>%
  kable(col.names = "Quantile") %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE)
| Quantile | |
|---|---|
| 25% | 3.8 |
| 50% | 25.0 |
| 75% | 100.0 |
| 90% | 500.0 |
| 99% | 15000.0 |
NA
NA
NA
NA
Split data
# Married: "N" -> 0, anything else (incl. "Y") -> 1; NA Married stays NA.
dataclean <- dataclean %>%
  mutate(Married_simple = ifelse(Married == "N", 0, 1))
# NOTE(review): HH.Lifetime.Giving as a factor has thousands of levels; a
# binomial glm on it treats the first level ("0") as failure and all other
# levels as success, i.e. an "ever gave" indicator -- confirm intent.
dataclean <- dataclean %>%
  mutate(hh.lifetime.giving_fct = as.factor(HH.Lifetime.Giving))
# rsample is already attached at the top of the script (redundant
# library() call removed). Seed added so the 75/25 split -- and every
# model metric downstream -- is reproducible.
set.seed(42)
data_split <- initial_split(dataclean, prop = 0.75)
data_train <- training(data_split)
data_test <- testing(data_split)
# Overall donor age histogram.
# FIX: xlim() after scale_x_continuous() added a second x scale that
# replaced the first (triggering the "Scale for 'x' is already present"
# warning); limits are now folded into the single x scale.
p <- dataclean %>%
  ggplot(aes(Age)) +
  geom_histogram(bins = 30, fill = "blue") +
  theme_economist_white() +
  ggtitle("Overall Donor Age Distribution") +
  xlab(NULL) + ylab(NULL) +
  scale_x_continuous(breaks = seq(5, 100, by = 20), limits = c(20, 100)) +
  scale_y_continuous(breaks = seq(20, 100, by = 20))
Scale for 'x' is already present. Adding another scale for 'x', which will replace the existing scale.
# Interactive (plotly) and static renderings of the plot above, plus a
# quick default-bins ggplot version for comparison.
ggplotly(p)
p
ggplot(data = dataclean, aes(x = Age)) + geom_histogram(fill ="blue")+ xlim(c(20,100))
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
NA
NA
NA
Another Histogram
# Age histogram restricted to plausible ages 10-90.
dataclean %>%
  filter(Age >= 10, Age <= 90) %>%
  ggplot(aes(x = Age)) +
  geom_histogram(fill = "#002845", bins = 20) +
  theme_economist_white() +
  labs(title = "Overall Donor Age Distribution", x = NULL, y = NULL) +
  scale_x_continuous(breaks = seq(0, 120, 5)) +
  scale_y_continuous(breaks = seq(0, 10000000, 2000))
Age distribution by gender
#Age Gender filtered out below 15 and above 90 - also removed U X the weird values
# Age distribution by sex, ages 15-90, excluding the rare "U"/"X" codes
# (NA Sex rows are dropped by the inequality filters, as before).
dataclean %>%
  filter(Age >= 15, Age <= 90) %>%
  mutate(Sex = as.factor(Sex)) %>%
  filter(Sex != "U" & Sex != "X") %>%
  ggplot(aes(x = Age, fill = Sex)) +
  geom_histogram(bins = 25) +
  theme_economist_white() +
  labs(title = "Age Distribution by Gender", x = NULL, y = NULL) +
  scale_x_continuous(breaks = seq(0, 120, 10)) +
  scale_y_continuous(breaks = seq(0, 50000, 2000)) +
  scale_fill_manual(values = c("#ff9973", "#00cfcc"))
Donor age distribution by marital status
#Age Marital Status
# Age distribution by marital status, ages 20-85.
dataclean %>%
  filter(Age >= 20, Age <= 85) %>%
  ggplot(aes(x = Age, fill = Married)) +
  geom_histogram(bins = 25) +
  theme_economist_white() +
  labs(title = "Overall Donor Age Distribution by Marital Status",
       x = NULL, y = NULL) +
  scale_x_continuous(breaks = seq(0, 120, 5)) +
  scale_y_continuous(breaks = seq(0, 50000, 2000)) +
  scale_fill_manual(values = c("#ff9973", "#00cfcc"))
Linear Model
# log(lifetime giving) vs age with a linear fit, faceted by degree count.
ggplot(data = data_train, aes(x = Age, y = log(HH.Lifetime.Giving))) +
  geom_point(alpha = 1/10) +
  geom_smooth(method = lm) +
  facet_wrap(~nmb_degree) +
  theme_clean(base_size = 8) +
  labs(x = "X", y = "Y") +
  ggtitle("Number of Degrees")
`geom_smooth()` using formula 'y ~ x'
MORE MODELS
Big logistic model
# Set family to binomial to set logistic function
# Run the model on the training set
# NOTE(review): the response is a many-level factor built from
# HH.Lifetime.Giving; glm() treats its first level ("0") as failure and
# every other level as success, so this effectively models "gave anything"
# vs "gave nothing" -- confirm that is the intent.
donor_logit1 <-
glm(hh.lifetime.giving_fct ~ Married_simple,
family = "binomial",
data = data_train)
summary(donor_logit1)
Call:
glm(formula = hh.lifetime.giving_fct ~ Married_simple, family = "binomial",
data = data_train)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.1122 -0.6872 -0.6872 1.2440 1.7659
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -1.323203 0.005906 -224.1 <0.0000000000000002 ***
Married_simple 1.167887 0.009628 121.3 <0.0000000000000002 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 288365 on 242249 degrees of freedom
Residual deviance: 273662 on 242248 degrees of freedom
AIC: 273666
Number of Fisher Scoring iterations: 4
# Same "ever gave" logistic response as donor_logit1, now against the
# number-of-children count (see the factor-response caveat above).
donor_logit2 <-
glm(hh.lifetime.giving_fct ~ No_of_Children,
family = "binomial",
data = data_train)
summary(donor_logit2)
Call:
glm(formula = hh.lifetime.giving_fct ~ No_of_Children, family = "binomial",
data = data_train)
Deviance Residuals:
Min 1Q Median 3Q Max
-0.9083 -0.8000 -0.8000 1.5411 1.6094
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -0.975131 0.005212 -187.10 <0.0000000000000002 ***
No_of_Children 0.151469 0.009049 16.74 <0.0000000000000002 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 288365 on 242249 degrees of freedom
Residual deviance: 288089 on 242248 degrees of freedom
AIC: 288093
Number of Fisher Scoring iterations: 4
#summary(data_train$major_gifter)
# Multivariate logistic model of major-gifter status (> $50K lifetime).
# Rows with NA in any predictor are dropped by glm's default na.action
# (the summary output below reports the deleted observations).
donor_logit3 <-
glm(major_gifter ~ Married_simple + No_of_Children + donorseg_simple + Assignment_flag + Total.Giving.Years,
family = "binomial",
data = data_train)
summary(donor_logit3)
Call:
glm(formula = major_gifter ~ Married_simple + No_of_Children +
donorseg_simple + Assignment_flag + Total.Giving.Years, family = "binomial",
data = data_train)
Deviance Residuals:
Min 1Q Median 3Q Max
-2.4932 -0.1410 -0.1220 -0.0873 3.5040
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -4.08066 0.24408 -16.718 < 0.0000000000000002 ***
Married_simple -1.23973 0.08749 -14.170 < 0.0000000000000002 ***
No_of_Children 0.71530 0.05765 12.407 < 0.0000000000000002 ***
donorseg_simpleCurrent Donor -0.04313 0.24679 -0.175 0.8613
donorseg_simpleLapsed Donor -0.60244 0.25434 -2.369 0.0179 *
donorseg_simpleLapsing Donor -0.40470 0.26802 -1.510 0.1311
donorseg_simpleLost Donor -0.96219 0.24415 -3.941 0.0000811 ***
Assignment_flag 1.19911 0.11744 10.210 < 0.0000000000000002 ***
Total.Giving.Years 0.14559 0.00441 33.013 < 0.0000000000000002 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 9764.4 on 68404 degrees of freedom
Residual deviance: 7544.5 on 68396 degrees of freedom
(173845 observations deleted due to missingness)
AIC: 7562.5
Number of Fisher Scoring iterations: 8
exp(donor_logit3$coefficients)
(Intercept) Married_simple No_of_Children
0.01689628 0.28946282 2.04479757
donorseg_simpleCurrent Donor donorseg_simpleLapsed Donor donorseg_simpleLapsing Donor
0.95778285 0.54747539 0.66717987
donorseg_simpleLost Donor Assignment_flag Total.Giving.Years
0.38205405 3.31714962 1.15672540
#training predictions for in sample preds
# NOTE(review): predict() returns NA for rows glm dropped due to missing
# predictors (visible in the head() output below).
preds_train <- predict(donor_logit3, newdata = data_train, type = "response")
#test predicts for OOS (out of sample)
preds_test <- predict(donor_logit3, newdata = data_test, type = "response")
head(preds_train)
196454 181024 64789 23864 86886 188620
NA 0.003806297 NA NA NA NA
head(preds_test)
1 6 17 18 21 27
0.047220650 0.764137284 0.931364886 0.001865082 0.040099014 0.188797205
# Assemble truth/score/type frames for ROC plotting.
# NOTE(review): as.numeric() on the major_gifter factor yields level codes
# 1/2, not 0/1; plotROC treats the larger value as the positive class, so
# the ROC curves are unaffected -- but don't reuse this column as a 0/1
# indicator elsewhere.
results_train <- data.frame(
`truth` = data_train %>% select(major_gifter) %>%
mutate(major_gifter = as.numeric(major_gifter)),
`Class1` = preds_train,
`type` = rep("train",length(preds_train))
)
results_test <- data.frame(
`truth` = data_test %>% select(major_gifter) %>%
mutate(major_gifter = as.numeric(major_gifter)),
`Class1` = preds_test,
`type` = rep("test",length(preds_test))
)
results <- bind_rows(results_train,results_test)
dim(results_train)
[1] 242250 3
dim(results_test)
[1] 80750 3
dim(results)
[1] 323000 3
library("plotROC")
# Combined train/test ROC curves with labeled probability cutoffs.
p_plot <- ggplot(results,
                 aes(m = Class1, d = major_gifter, color = type)) +
  # smaller labels and a trimmed cutoff list keep the origin readable
  geom_roc(labelsize = 2.5,
           cutoffs.at = c(0.7, 0.5, 0.3, 0.1, 0)) +
  # classic theme avoids clipped plot values
  theme_classic(base_size = 14) +
  labs(x = "False Positive Rate",
       y = "True Positive Rate") +
  ggtitle("ROC Plot: Training and Test")
print(p_plot)
# Separate train-only and test-only ROC objects, consumed by calc_auc below.
# BUG FIX: both plots were titled "ROC Plot: Training and Test" even though
# each contains a single split; titles now match their contents.
p_train <-
  ggplot(results_train,
         aes(m = Class1, d = major_gifter, color = type)) +
  geom_roc(labelsize = 3.5,
           cutoffs.at = c(0.7, 0.5, 0.3, 0.1, 0)) +
  theme_minimal(base_size = 16) +
  labs(x = "False Positive Rate",
       y = "True Positive Rate") +
  ggtitle("ROC Plot: Training")
p_test <-
  ggplot(results_test,
         aes(m = Class1, d = major_gifter, color = type)) +
  geom_roc(labelsize = 3.5,
           cutoffs.at = c(0.7, 0.5, 0.3, 0.1, 0)) +
  theme_minimal(base_size = 16) +
  labs(x = "False Positive Rate",
       y = "True Positive Rate") +
  ggtitle("ROC Plot: Test")
#Calculating AUC of both
print(calc_auc(p_train)$AUC)
[1] 0.8823867
print(calc_auc(p_test)$AUC)
[1] 0.8713205
RIDGE
library('glmnet')
library('glmnetUtils')
# Ridge regression (alpha = 0) with 10-fold CV on lifetime giving.
# NOTE(review): cv.glmnet's fold assignment is random -- add set.seed()
# immediately before this call for reproducible lambda estimates.
ridge_fit1 <- cv.glmnet(HH.Lifetime.Giving ~ sex_fct + donorseg_fct + No_of_Children,
data = data_train,
alpha = 0)
#Alpha 0 sets the Ridge
print(ridge_fit1)
Call:
cv.glmnet.formula(formula = HH.Lifetime.Giving ~ sex_fct + donorseg_fct +
No_of_Children, data = data_train, alpha = 0)
Model fitting options:
Sparse model matrix: FALSE
Use model.frame: FALSE
Number of crossvalidation folds: 10
Alpha: 0
Deviance-minimizing lambda: 1192.867 (+1 SE): 11928665
print(ridge_fit1$lambda.min)
[1] 1192.867
print(ridge_fit1$lambda.1se)
[1] 11928665
LASSO
#enet_mod <- cva.glmnet(dependent ~ indy1 + indy2,
# data = data,
# alpha = seq(0,1, by = 0.1))
#print(enet_mod)
#plot(enet_mod)
ELASTICNET
# BUG FIX: the cva.glmnet fit above is commented out, so enet_mod does not
# exist and this call errored ("object 'enet_mod' not found"); guard it so
# the script can keep running when the elastic-net section is skipped.
if (exists("enet_mod")) {
  minlossplot(enet_mod, cv.type = "min")
} else {
  message("enet_mod has not been fitted; skipping minlossplot()")
}
Error in minlossplot(enet_mod, cv.type = "min") :
object 'enet_mod' not found
Ridges plot - could be useful for plotting donations vs donor segment
# Scatter of lifetime giving vs degree count with a linear fit.
# BUG FIX: geom_point(aplha = ...) was a typo -- ggplot2 ignores unknown
# parameters, so the points rendered fully opaque; corrected to alpha.
# NOTE(review): corrplot_data is not defined anywhere in this script --
# confirm where it is created before running this chunk.
ggplot(data = corrplot_data, aes(x = nmb_degree, y = HH.Lifetime.Giving)) +
  geom_point(alpha = 1/10) +
  geom_smooth(method = "lm", color = "red")
`geom_smooth()` using formula 'y ~ x'
Random Forest
# Random forest on lifetime giving.
# FIX: `type` must be a string, not a bare symbol; note that with a numeric
# response randomForest runs in regression mode regardless of this arg.
# NOTE(review): na.roughfix() only imputes numeric/factor columns, which is
# why the original call errored on this data -- drop or convert character
# columns (e.g. select(where(~ is.numeric(.x) || is.factor(.x)))) before
# fitting. randomForest must also be attached; it is not loaded in the
# library chunk at the top.
rf_fit_donor <- randomForest(Lifetime.Giving ~ .,
                             data = data_train,
                             type = "classification",
                             mtry = 7,
                             na.action = na.roughfix,
                             ntree = 200,
                             importance = TRUE)
Error in na.roughfix.data.frame(list(Lifetime.Giving = c(0, 0, 0, 0, 0, :
na.roughfix only works for numeric or factor